full_data <- readRDS('data/full_data.rds')

Compute Daily Max H2S

H2S_dm <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_max = max(H2S, na.rm = TRUE)) %>%
  mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 501 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1215: `day = 2020-07-12`, `Monitor = "First Methodist"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 500 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_ma <- H2S_dm %>%
  mutate(yearmonth = format(day, "%Y-%m")) %>%
  group_by(yearmonth, Monitor) %>%
  summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE))
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
H2S_dm_graph <- H2S_dm %>%
 ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Max H2S concentration by monitor", y = 'Daily Max H2S', x = 'time') +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph) %>% layout(dragmode = 'pan')
H2S_dm_graph_b <- H2S_dm %>%
 ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Max H2S concentration by monitor w/o outlier", y = 'Daily Max H2S', x = 'time') +
   scale_y_continuous(limits = c(0, 100)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
H2S_ma_graph <- H2S_ma %>%
 ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Monthly Average H2S concentration by monitor", y = 'Monthly Average H2S', x = 'time') +
   scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_ma_graph) %>% layout(dragmode = 'pan')
H2S_ma_graph_b <- H2S_ma %>%
 ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Monthly Average H2S concentration by monitor w/o outlier", y = 'Monthly Average H2S', x = 'time') +
   scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
  scale_y_continuous(limits=c(0, 50)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_ma_graph_b) %>% layout(dragmode = 'pan')

The line with the largest deviation/peak corresponds to the 213th&Chico monitor, in 2021-10 Maybe we should start from January 2022 to start modelling to remove the extreme values, or simply removing the 213 monitor since that’s set up specifically for the event.

full_data %>%
  polarPlot(pollutant = "H2S", col = "turbo", 
            key.position = "bottom",
            key.header = "mean H2S", 
            key.footer = NULL, title = 'hi')

# Compute daily average wd/ws
daily_full <- full_data %>%
              select(c(H2S, ws, wd, latitude, longitude, Monitor, MinDist, Converted_Angle, Refinery, year, month, day)) %>%
  group_by(Monitor, day, latitude, longitude, Refinery, Converted_Angle, year, month, MinDist) %>%
  summarise(H2S_daily_max = max(H2S, na.rm=TRUE),
            H2S_daily_avg = mean(H2S, na.rm=TRUE),
            ws_avg = mean(ws, na.rm=TRUE),
            wd_avg = mean(wd, na.rm=TRUE)) %>%
  mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 3681 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 453: `Monitor = "First Methodist"`, `day = 2020-07-09`, `latitude =
##   NA`, `longitude = NA`, `Refinery = "Phillips 66 (Wilmington)"`,
##   `Converted_Angle = 209`, `year = "2020"`, `month = "07"`, `MinDist =
##   2119.191`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 3680 remaining warnings.
## `summarise()` has grouped output by 'Monitor', 'day', 'latitude', 'longitude',
## 'Refinery', 'Converted_Angle', 'year', 'month'. You can override using the
## `.groups` argument.

Explore some GAM models

h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        9.091e-01  1.084e+00   0.839   0.4016    
## year2021                           3.076e-01  1.082e+00   0.284   0.7763    
## year2022                          -7.302e-01  1.083e+00  -0.675   0.5000    
## wd_secQ2                           8.995e-02  4.755e-02   1.892   0.0585 .  
## wd_secQ3                           8.260e-01  5.171e-02  15.974  < 2e-16 ***
## wd_secQ4                           4.469e-01  4.417e-02  10.118  < 2e-16 ***
## ws                                -9.682e-02  5.944e-03 -16.290  < 2e-16 ***
## I(1/MinDist^2)                     2.222e+04  3.730e+04   0.596   0.5514    
## RefineryMarathon (Carson)          2.989e+00  5.907e-02  50.603  < 2e-16 ***
## RefineryMarathon (Wilmington)      8.388e-01  6.456e-02  12.993  < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -5.341e-01  5.964e-02  -8.955  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)   3.880e-01  6.332e-02   6.127 8.94e-10 ***
## RefineryTorrance Refinery          8.476e-02  6.389e-02   1.327   0.1846    
## RefineryValero Refinery            5.988e-01  6.286e-02   9.526  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F p-value    
## s(as.numeric(month)) 7.996      8 881.4  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.00848   Deviance explained = 0.849%
## GCV = 336.53  Scale est. = 336.52    n = 1730935
plot(h2s_model_a)

h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
                   Refinery+s(latitude, longitude, bs='tp', k = 5), 
                   data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) + 
##     Refinery + s(latitude, longitude, bs = "tp", k = 5)
## 
## Parametric coefficients:
##                                     Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                        3.3701574  0.0568743   59.256  < 2e-16 ***
## wd_secQ2                          -0.1223242  0.0032107  -38.098  < 2e-16 ***
## wd_secQ3                          -0.0276010  0.0035146   -7.853 4.06e-15 ***
## wd_secQ4                           0.0497316  0.0030516   16.297  < 2e-16 ***
## ws                                -0.0379057  0.0003735 -101.491  < 2e-16 ***
## I(1/MinDist^2)                     0.0005294  0.0000120   44.128  < 2e-16 ***
## RefineryMarathon (Carson)         -2.3029517  0.0477069  -48.273  < 2e-16 ***
## RefineryMarathon (Wilmington)     -2.7010960  0.0622682  -43.378  < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -3.5993158  0.0805031  -44.710  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)  -3.2913878  0.0854036  -38.539  < 2e-16 ***
## RefineryTorrance Refinery         -0.2786949  0.0148328  -18.789  < 2e-16 ***
## RefineryValero Refinery           -3.3474680  0.0792409  -42.244  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df    F p-value    
## s(as.numeric(month))  7.998      8 1740  <2e-16 ***
## s(latitude,longitude) 2.000      2 2368  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 23/24
## R-sq.(adj) =  0.102   Deviance explained = 10.2%
## GCV = 0.64572  Scale est. = 0.6457    n = 769068
plot(h2s_model_b)

h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_dm_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_sec + 
##     ws + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        5.024e+00  5.659e+00   0.888  0.37466    
## year2021                           1.330e+00  5.651e+00   0.235  0.81393    
## year2022                          -8.367e+00  5.652e+00  -1.480  0.13878    
## wd_secQ2                          -1.341e+00  2.445e-01  -5.484 4.16e-08 ***
## wd_secQ3                           3.643e+00  2.663e-01  13.679  < 2e-16 ***
## wd_secQ4                          -2.626e-01  2.275e-01  -1.154  0.24832    
## ws                                 4.156e-02  3.054e-02   1.361  0.17364    
## I(1/MinDist^2)                     9.242e+04  1.926e+05   0.480  0.63123    
## RefineryMarathon (Carson)          2.569e+01  3.040e-01  84.515  < 2e-16 ***
## RefineryMarathon (Wilmington)      4.007e+00  3.327e-01  12.043  < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -8.985e-01  3.075e-01  -2.922  0.00348 ** 
## RefineryPhillips 66 (Wilmington)   3.154e+00  3.267e-01   9.656  < 2e-16 ***
## RefineryTorrance Refinery          8.734e-01  3.292e-01   2.653  0.00799 ** 
## RefineryValero Refinery            5.897e+00  3.242e-01  18.185  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.998      8 2750  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0229   Deviance explained =  2.3%
## GCV = 9173.7  Scale est. = 9173.6    n = 1777112
plot(h2s_dm_model_a)

h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_sec+ws+
                        I(1/MinDist^2)+Refinery+s(latitude, longitude, bs='tp', k=5), 
                      data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + 
##     I(1/MinDist^2) + Refinery + s(latitude, longitude, bs = "tp", 
##     k = 5)
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        1.556e+01  4.284e-01  36.324  < 2e-16 ***
## wd_secQ2                          -3.802e-01  1.199e-02 -31.723  < 2e-16 ***
## wd_secQ3                          -2.731e-01  1.312e-02 -20.813  < 2e-16 ***
## wd_secQ4                          -3.293e-02  1.139e-02  -2.891  0.00384 ** 
## ws                                -6.881e-02  1.394e-03 -49.358  < 2e-16 ***
## I(1/MinDist^2)                     2.630e-03  8.341e-05  31.531  < 2e-16 ***
## RefineryMarathon (Carson)         -1.115e+01  4.713e-01 -23.652  < 2e-16 ***
## RefineryMarathon (Wilmington)     -1.387e+01  4.938e-01 -28.095  < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -1.860e+01  5.276e-01 -35.248  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)  -1.830e+01  5.416e-01 -33.783  < 2e-16 ***
## RefineryTorrance Refinery         -2.225e+00  3.238e-01  -6.873 6.31e-12 ***
## RefineryValero Refinery           -1.383e+01  5.266e-01 -26.269  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df    F p-value    
## s(as.numeric(month))  7.965      8 2801  <2e-16 ***
## s(latitude,longitude) 2.000      2 1573  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 23/24
## R-sq.(adj) =  0.256   Deviance explained = 25.6%
## GCV = 8.9977  Scale est. = 8.9974    n = 769068
plot(h2s_dm_model_b)

# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
                        I(1/MinDist^2)+Refinery+Converted_Angle+
                        s(latitude, longitude, bs='tp', k=5), 
                      data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg + 
##     I(1/MinDist^2) + Refinery + Converted_Angle + s(latitude, 
##     longitude, bs = "tp", k = 5)
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.212e+01  3.794e+00   5.830 6.18e-09 ***
## wd_avg                             4.190e-03  1.536e-03   2.727 0.006426 ** 
## ws_avg                            -5.292e-01  5.738e-02  -9.223  < 2e-16 ***
## I(1/MinDist^2)                     2.581e-03  5.113e-04   5.047 4.77e-07 ***
## RefineryMarathon (Carson)          1.916e+00  6.758e-01   2.836 0.004602 ** 
## RefineryMarathon (Wilmington)     -5.750e+00  1.035e+00  -5.556 3.01e-08 ***
## RefineryPhillips 66 (Wilimington) -2.133e+01  4.006e+00  -5.324 1.10e-07 ***
## RefineryPhillips 66 (Wilmington)  -2.374e+01  4.854e+00  -4.892 1.05e-06 ***
## RefineryTorrance Refinery          1.102e+01  2.159e+00   5.104 3.54e-07 ***
## RefineryValero Refinery           -9.947e+00  2.717e+00  -3.660 0.000257 ***
## Converted_Angle                   -4.559e-02  9.494e-03  -4.801 1.66e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F p-value    
## s(as.numeric(month))  5.160  8.000  6.37  <2e-16 ***
## s(latitude,longitude) 1.895  1.895 69.03  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 21/23
## R-sq.(adj) =  0.269   Deviance explained = 27.3%
## GCV = 9.0112  Scale est. = 8.959     n = 2791
plot(h2s_dm_model_c)

h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        4.562e+00  4.624e+00   0.987 0.323830    
## year2021                           1.076e+00  4.618e+00   0.233 0.815796    
## year2022                          -8.169e+00  4.618e+00  -1.769 0.076930 .  
## wd_secQ2                          -7.359e-01  1.981e-01  -3.714 0.000204 ***
## wd_secQ3                           3.097e+00  2.157e-01  14.360  < 2e-16 ***
## wd_secQ4                          -7.004e-01  1.846e-01  -3.794 0.000148 ***
## ws                                 1.185e-01  2.461e-02   4.815 1.47e-06 ***
## I(1/MinDist^2)                     2.964e+04  1.569e+05   0.189 0.850144    
## RefineryMarathon (Carson)          2.395e+01  2.453e-01  97.616  < 2e-16 ***
## RefineryMarathon (Wilmington)      4.224e+00  2.706e-01  15.613  < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -2.615e-02  2.499e-01  -0.105 0.916656    
## RefineryPhillips 66 (Wilmington)   3.524e+00  2.658e-01  13.262  < 2e-16 ***
## RefineryTorrance Refinery          1.570e+00  2.636e-01   5.956 2.58e-09 ***
## RefineryValero Refinery            6.305e+00  2.632e-01  23.960  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.999      8 3996  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0307   Deviance explained = 3.07%
## GCV = 6125.1  Scale est. = 6125      n = 1814982
plot(h2s_ma_model_a)

h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude, longitude, bs='tp', k=10), data = full_data)
summary(h2s_ma_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery + s(latitude, longitude, 
##     bs = "tp", k = 10)
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -1.223e+03  1.439e+01 -85.009  < 2e-16 ***
## year2021                           4.242e+00  4.442e+00   0.955 0.339588    
## year2022                          -5.266e-01  4.442e+00  -0.119 0.905646    
## wd_secQ2                           4.182e-01  1.951e-01   2.143 0.032118 *  
## wd_secQ3                           7.243e-01  2.123e-01   3.411 0.000647 ***
## wd_secQ4                          -1.549e-01  1.814e-01  -0.854 0.393162    
## ws                                 4.626e-02  2.439e-02   1.896 0.057917 .  
## I(1/MinDist^2)                     5.255e+07  4.687e+05 112.116  < 2e-16 ***
## RefineryMarathon (Carson)          9.406e+02  1.506e+01  62.463  < 2e-16 ***
## RefineryMarathon (Wilmington)      1.232e+03  1.580e+01  77.956  < 2e-16 ***
## RefineryPhillips 66 (Wilimington)  1.645e+03  1.577e+01 104.304  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)   1.722e+03  1.614e+01 106.675  < 2e-16 ***
## RefineryTorrance Refinery          2.228e+02  1.037e+01  21.490  < 2e-16 ***
## RefineryValero Refinery            1.615e+03  1.703e+01  94.818  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df     F p-value    
## s(as.numeric(month))    8      8  5187  <2e-16 ***
## s(latitude,longitude)   2      2 15468  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.144   Deviance explained = 14.4%
## GCV =   5667  Scale est. = 5666.9    n = 1730935
plot(h2s_ma_model_b)

Try XGBoost on Daily Max

daily_full <- daily_full %>%
  mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
         Monitor = str_replace_all(Refinery, ' ', '_'))

## Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200),
                         max_depth = c(2, 3, 4),
                         eta = c(0.1, 0.3),
                         gamma = c(0.01, 0.001),
                         colsample_bytree = c(0.5, 1),
                         min_child_weight = 0,
                         subsample = c(0.5, 0.75, 1))
# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", number=10,verboseIter=TRUE, search='grid')

fit.xgb <- train(H2S_daily_max~., 
                 method = 'xgbTree', 
                 data = fastDummies::dummy_cols(daily_full[complete.cases(daily_full),] %>% 
                   ungroup() %>% 
                   filter(day >= '2022-02-01') %>% 
                   select(-c(year, day, H2S_daily_avg)), 
                   remove_selected_columns = TRUE)
                 ,
                 trControl=control,
                 tuneGrid = tune_grid,
                 tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
getTrainPerf(fit.xgb)
fit.xgb$finalModel
## ##### xgb.Booster
## raw: 116.3 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror", 
##     importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
##   eta = "0.1", max_depth = "3", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.5", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## # of features: 31 
## niter: 100
## nfeatures : 31 
## xNames : latitude longitude Converted_Angle MinDist ws_avg wd_avg Monitor_Chevron_El_Segundo Monitor_Marathon_Carson Monitor_Marathon_Wilmington Monitor_Phillips_66_Wilimington Monitor_Phillips_66_Wilmington Monitor_Torrance_Refinery Monitor_Valero_Refinery Refinery_Chevron_El_Segundo Refinery_Marathon_Carson Refinery_Marathon_Wilmington Refinery_Phillips_66_Wilimington Refinery_Phillips_66_Wilmington Refinery_Torrance_Refinery Refinery_Valero_Refinery month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 
## problemType : Regression 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 37     100         3 0.1  0.01              0.5                0       0.5
## obsLevels : NA 
## param :
##  $importance
## [1] TRUE
## 
## $verbosity
## [1] 0
## 
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb,scale=FALSE)
plot(imp, top=10)

SHAP for XGBoost above (Caret)

xgb.plot.shap(data = 
                fastDummies::dummy_cols(daily_full[complete.cases(daily_full),] %>% 
                   ungroup() %>% 
                   filter(day >= '2022-02-01') %>% 
                   select(-c(year, day, H2S_daily_avg, H2S_daily_max)), 
                   remove_selected_columns = TRUE) %>% 
                as.matrix(), 
              model = fit.xgb$finalModel, top_n = 12, n_col = 3)
## Warning in sqrt(sum.squares/one.delta): NaNs produced

xgb.ggplot.shap.summary(data = fastDummies::dummy_cols(daily_full[complete.cases(daily_full),] %>% 
                   ungroup() %>% 
                   filter(day >= '2022-02-01') %>% 
                   select(-c(year, day, H2S_daily_avg, H2S_daily_max)), 
                   remove_selected_columns = TRUE) %>% 
                as.matrix(), 
              model = fit.xgb$finalModel, top_n = 10)